import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa as librosa
import librosa.display
import os

# sets initial plot parameters for all of our plotting in this notebook
# NOTE: matplotlib >= 3.6 renamed the seaborn style sheets, so fall back to
# the versioned name when the legacy one is unavailable.
try:
    plt.style.use("seaborn-whitegrid")
except OSError:
    plt.style.use("seaborn-v0_8-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
# shared keyword arguments for pandas .plot() calls
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
df = pd.read_csv('note_info.csv')  # imports audio sample information dataframe
df.head()
| Unnamed: 0 | note_str | sample_rate | qualities_str | instrument_source | instrument_family_str | instrument_family | note | instrument_source_str | qualities | pitch | instrument_str | instrument | velocity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | keyboard_acoustic_004-060-025 | keyboard_acoustic_004-060-025 | 16000 | ['dark', 'reverb'] | 0 | keyboard | 4 | 278915 | acoustic | [0, 1, 0, 0, 0, 0, 0, 0, 1, 0] | 60 | keyboard_acoustic_004 | 327 | 25 |
| 1 | bass_synthetic_033-050-100 | bass_synthetic_033-050-100 | 16000 | ['dark'] | 2 | bass | 0 | 270361 | synthetic | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] | 50 | bass_synthetic_033 | 417 | 100 |
| 2 | bass_synthetic_009-052-050 | bass_synthetic_009-052-050 | 16000 | ['bright', 'distortion', 'long_release'] | 2 | bass | 0 | 270001 | synthetic | [1, 0, 1, 0, 1, 0, 0, 0, 0, 0] | 52 | bass_synthetic_009 | 150 | 50 |
| 3 | keyboard_electronic_003-064-127 | keyboard_electronic_003-064-127 | 16000 | [] | 1 | keyboard | 4 | 50978 | electronic | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | 64 | keyboard_electronic_003 | 65 | 127 |
| 4 | bass_synthetic_034-030-050 | bass_synthetic_034-030-050 | 16000 | ['distortion', 'tempo-synced'] | 2 | bass | 0 | 265159 | synthetic | [0, 0, 1, 0, 0, 0, 0, 0, 0, 1] | 30 | bass_synthetic_034 | 420 | 50 |
df.info()  # displays dataframe structure: row count, dtypes, null counts
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12678 entries, 0 to 12677 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 12678 non-null object 1 note_str 12678 non-null object 2 sample_rate 12678 non-null int64 3 qualities_str 12678 non-null object 4 instrument_source 12678 non-null int64 5 instrument_family_str 12678 non-null object 6 instrument_family 12678 non-null int64 7 note 12678 non-null int64 8 instrument_source_str 12678 non-null object 9 qualities 12678 non-null object 10 pitch 12678 non-null int64 11 instrument_str 12678 non-null object 12 instrument 12678 non-null int64 13 velocity 12678 non-null int64 dtypes: int64(7), object(7) memory usage: 1.4+ MB
len(df['pitch'].unique()) # identifies number of unique pitches in the dataset
112
# defines function to extract various features from audio files. extraction functions provided by gist.github.com/gvyshnya
def extract_feature_means(audio_file_path: str) -> pd.DataFrame:
    """Load one audio file and return a single-row DataFrame of mean features.

    Features: zero-crossing stats, dB spectrogram / mel-spectrogram levels,
    harmonic and percussive means, spectral centroid (+delta/acceleration),
    12 chroma bins, tempo (BPM), spectral rolloff, onset strength (spectral
    flux), spectral bandwidth (p=2,3,4), and MFCC statistics merged in from
    extract_mfcc_feature_means. Keyed by 'file_name' for downstream merges.
    """
    # config settings
    number_of_mfcc = 20

    y, sr = librosa.load(audio_file_path)
    # trim leading/trailing silence before computing any features
    signal, _ = librosa.effects.trim(y)

    n_fft = 512  # FFT window size
    hop_length = 256  # number of audio frames between STFT columns

    # Short-time Fourier transform (STFT) magnitude.
    # NOTE: librosa >= 0.10 requires the audio input as a keyword argument,
    # so y= / sr= are passed explicitly throughout this function.
    d_audio = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length))
    # convert the amplitude spectrogram to a decibel-scaled spectrogram
    db_audio = librosa.amplitude_to_db(d_audio, ref=np.max)

    # mel spectrogram, also converted to dB
    s_audio = librosa.feature.melspectrogram(y=signal, sr=sr)
    s_db_audio = librosa.amplitude_to_db(s_audio, ref=np.max)

    # harmonic (tonal color) and percussive (rhythm/attack) components
    y_harm, y_perc = librosa.effects.hpss(signal)

    # spectral centroid: weighted mean of the frequencies present in the sound
    spectral_centroids = librosa.feature.spectral_centroid(y=signal, sr=sr)[0]
    spectral_centroids_delta = librosa.feature.delta(spectral_centroids)
    spectral_centroids_accelerate = librosa.feature.delta(spectral_centroids, order=2)

    # chroma: spectrum projected onto the 12 semitone bins of the octave
    chromagram = librosa.feature.chroma_stft(y=signal, sr=sr, hop_length=hop_length)

    # tempo (BPM) from the dynamic-programming beat tracker
    tempo_y, _ = librosa.beat.beat_track(y=signal, sr=sr)

    # spectral rolloff: frequency below which most spectral energy (~85%) lies
    spectral_rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sr)[0]

    # onset strength envelope, used here as a spectral-flux proxy
    onset_env = librosa.onset.onset_strength(y=signal, sr=sr)

    # spectral bandwidth at p = 2 (default), 3 and 4
    spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(y=signal, sr=sr)[0]
    spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(y=signal, sr=sr, p=3)[0]
    spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(y=signal, sr=sr, p=4)[0]

    # assemble the feature row; insertion order is preserved so the resulting
    # column order matches the original implementation
    audio_features = {
        "file_name": audio_file_path,
        "zero_crossing_rate": np.mean(librosa.feature.zero_crossing_rate(signal)[0]),
        "zero_crossings": np.sum(librosa.zero_crossings(signal, pad=False)),
        "spectrogram": np.mean(db_audio[0]),
        "mel_spectrogram": np.mean(s_db_audio[0]),
        "harmonics": np.mean(y_harm),
        "perceptual_shock_wave": np.mean(y_perc),
        "spectral_centroids": np.mean(spectral_centroids),
        "spectral_centroids_delta": np.mean(spectral_centroids_delta),
        "spectral_centroids_accelerate": np.mean(spectral_centroids_accelerate),
    }
    # chroma1..chroma12: mean of each semitone bin
    audio_features.update(
        {f"chroma{i + 1}": np.mean(chromagram[i]) for i in range(12)}
    )
    audio_features.update({
        "tempo_bpm": tempo_y,
        "spectral_rolloff": np.mean(spectral_rolloff),
        "spectral_flux": np.mean(onset_env),
        "spectral_bandwidth_2": np.mean(spectral_bandwidth_2),
        "spectral_bandwidth_3": np.mean(spectral_bandwidth_3),
        "spectral_bandwidth_4": np.mean(spectral_bandwidth_4),
    })

    # extract mfcc features and merge on the shared file_name key
    mfcc_df = extract_mfcc_feature_means(audio_file_path,
                                         signal,
                                         sample_rate=sr,
                                         number_of_mfcc=number_of_mfcc)
    # local name chosen to avoid shadowing the module-level `df`
    features_df = pd.DataFrame.from_records(data=[audio_features])
    return pd.merge(features_df, mfcc_df, on='file_name')
# librosa.feature.mfcc(signal)[0, 0]
def extract_mfcc_feature_means(audio_file_name: str,
                               signal: np.ndarray,
                               sample_rate: int,
                               number_of_mfcc: int) -> pd.DataFrame:
    """Return a one-row DataFrame of mean MFCC, delta, and acceleration values.

    Columns are 'file_name' plus, for each coefficient i in
    [0, number_of_mfcc): 'mfcc{i}', 'mfcc_delta_{i}', 'mfcc_accelerate_{i}'.
    """
    mfcc_alt = librosa.feature.mfcc(y=signal, sr=sample_rate,
                                    n_mfcc=number_of_mfcc)
    # first- and second-order differences of the MFCC matrix
    delta = librosa.feature.delta(mfcc_alt)
    accelerate = librosa.feature.delta(mfcc_alt, order=2)

    mfcc_features = {"file_name": audio_file_name}
    for i in range(number_of_mfcc):
        mfcc_features[f"mfcc{i}"] = np.mean(mfcc_alt[i])
        mfcc_features[f"mfcc_delta_{i}"] = np.mean(delta[i])
        mfcc_features[f"mfcc_accelerate_{i}"] = np.mean(accelerate[i])

    return pd.DataFrame.from_records(data=[mfcc_features])
path = r"C:\Users\ksivi\Desktop\New folder\nsynth-valid\audio" # defines path for audio files
dir_list = os.listdir(path) # creates list of files found in path
len(dir_list) # number of audio files found
12676
# extracts features from every audio file, appending one row of values per file
info = []
for file_name in dir_list:
    try:
        data = extract_feature_means(os.path.join(path, file_name))
    # catch Exception (not bare except) so Ctrl-C / SystemExit still work;
    # unreadable or corrupt files are skipped rather than aborting the run
    except Exception:
        continue
    info.append(data.values[0])

df_2 = extract_feature_means('bass_electronic_018-022-025.wav') # extracts features from single audio file
df_2.head()
| file_name | zero_crossing_rate | zero_crossings | spectrogram | mel_spectrogram | harmonics | perceptual_shock_wave | spectral_centroids | spectral_centroids_delta | spectral_centroids_accelerate | ... | mfcc_accelerate_16 | mfcc17 | mfcc_delta_17 | mfcc_accelerate_17 | mfcc18 | mfcc_delta_18 | mfcc_accelerate_18 | mfcc19 | mfcc_delta_19 | mfcc_accelerate_19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bass_electronic_018-022-025.wav | 0.210125 | 15742 | -66.413757 | -77.642357 | 0.000006 | -0.000673 | 2671.302747 | 22.499443 | 1.72652 | ... | 0.046337 | -0.739872 | -0.046959 | 0.051725 | -0.290447 | -0.08219 | 0.018868 | 2.694299 | -0.056343 | -0.000569 |
1 rows × 88 columns
columns = df_2.columns # pulls columns list from df_2

# creates dataframe from extracted audio file features
va = pd.DataFrame(data=info, columns=columns)

# reduces each file_name from a full path to the bare note name (no folder,
# no .wav). Vectorized via .map to avoid the chained-assignment warning of
# `va['file_name'][i] = ...`, and uses basename/splitext instead of the
# fragile hard-coded [53:-4] character slice.
va['file_name'] = va['file_name'].map(
    lambda p: os.path.splitext(os.path.basename(str(p)))[0]
)

df['note_str'] = df['note_str'].astype(str) # converts note type to string

# combines extracted audio features with audio sample information dataframe
samples = df.merge(va, left_on='note_str', right_on='file_name', how='right')

samples['instrument_family_str'].value_counts() # views count of unique instruments
bass 2635 keyboard 2402 guitar 2070 organ 1598 brass 886 string 814 reed 720 mallet 663 flute 470 vocal 404 Name: instrument_family_str, dtype: int64
len(samples) # number of rows after merging features with sample metadata
12662
# plots wave shape of audio file
plt.figure(figsize=(20,4))
# dir_list[8512] is a mallet sample (see title below)
x, sr = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[8512])
librosa.display.waveshow(y=x,sr=sr)
plt.title("Mallet Audio Wave")
# zoom to the first two seconds of the clip
plt.xlim(-0.1,2);
plt.savefig('mallet_wave.png')
# plots spectrogram (frequency vs time) of audio file
y, sr = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[455])
fig, ax = plt.subplots(nrows=2, ncols=1, sharex=True)
# top panel: linear-frequency spectrogram with the default hop length
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
img = librosa.display.specshow(D, y_axis='linear', x_axis='time',sr=sr, ax=ax[0])
ax[0].set(title=dir_list[455])
ax[0].label_outer()
# bottom panel: log-frequency spectrogram with a coarser hop length
hop_length = 1024
D = librosa.amplitude_to_db(np.abs(librosa.stft(y, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D, y_axis='log', sr=sr, hop_length=hop_length,x_axis='time', ax=ax[1])
ax[1].set(title='Log-frequency power spectrogram')
ax[1].label_outer()
# one shared colorbar for both panels, using the top panel's mappable
fig.colorbar(img, ax=ax, format="%+2.f dB", anchor =(3,1))
ax[0].set_xlim(0,2)
ax[1].set_xlim(0,2);
plt.savefig('bass_spec.png')
# plots spectrograms (frequency vs time) of two audio files side by side
y1, sr1 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[4002])
y2, sr2 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[455])
fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, figsize=(20, 8))
fig.tight_layout(pad=5)

# top row: linear-frequency spectrograms with the default hop length
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1)), ref=np.max)
img1 = librosa.display.specshow(D1, y_axis='linear', x_axis='time', sr=sr1, ax=axs[0, 0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2)), ref=np.max)
img2 = librosa.display.specshow(D2, y_axis='linear', x_axis='time', sr=sr2, ax=axs[0, 1])
axs[0, 0].set_title(dir_list[4002], fontsize=20)
axs[0, 1].set_title(dir_list[455], fontsize=20)

# bottom row: log-frequency spectrograms with a coarser hop length
hop_length = 1024
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D1, y_axis='log', sr=sr1, hop_length=hop_length, x_axis='time', ax=axs[1, 0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D2, y_axis='log', sr=sr2, hop_length=hop_length, x_axis='time', ax=axs[1, 1])
axs[1, 0].set(title='Log-frequency power spectrogram')
axs[1, 1].set(title='Log-frequency power spectrogram')

# BUG FIX: the colorbar previously used `img`, the mappable from an earlier,
# unrelated figure; use this figure's own mappable so the scale matches.
fig.colorbar(img1, ax=axs, format="%+2.f dB", anchor=(2, 1))
plt.setp(axs, xlim=(0, 2));
plt.savefig('guitar_bass_spec.png')
# plots spectrograms (frequency vs time) of two audio files side by side
y3, sr3 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[8512])
y4, sr4 = librosa.load("C:/Users/ksivi/Desktop/New folder/nsynth-valid/audio/"+dir_list[7481])
fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, figsize=(20, 8))
fig.tight_layout(pad=5)

# top row: linear-frequency spectrograms with the default hop length
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y3)), ref=np.max)
img1 = librosa.display.specshow(D1, y_axis='linear', x_axis='time', sr=sr3, ax=axs[0, 0])
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y4)), ref=np.max)
img2 = librosa.display.specshow(D2, y_axis='linear', x_axis='time', sr=sr4, ax=axs[0, 1])
axs[0, 0].set_title(dir_list[8512], fontsize=20)
axs[0, 1].set_title(dir_list[7481], fontsize=20)

# bottom row: log-frequency spectrograms with a coarser hop length
hop_length = 1024
D3 = librosa.amplitude_to_db(np.abs(librosa.stft(y3, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D3, y_axis='log', sr=sr3, hop_length=hop_length, x_axis='time', ax=axs[1, 0])
D4 = librosa.amplitude_to_db(np.abs(librosa.stft(y4, hop_length=hop_length)), ref=np.max)
librosa.display.specshow(D4, y_axis='log', sr=sr4, hop_length=hop_length, x_axis='time', ax=axs[1, 1])
axs[1, 0].set(title='Log-frequency power spectrogram')
axs[1, 1].set(title='Log-frequency power spectrogram')

# BUG FIX: the colorbar previously used `img`, the mappable from an earlier,
# unrelated figure; use this figure's own mappable so the scale matches.
fig.colorbar(img1, ax=axs, format="%+2.f dB", anchor=(2, 1))
plt.setp(axs, xlim=(0, 2));
plt.savefig('mallet_keyboard_spec.png')
# plots scatterplot of 2 audio features (spectral centroid and zero crossing rate)
# with the hue representing pitch
plt.figure(figsize=(16, 16), dpi=200)
sns.scatterplot(data=samples, x='spectral_centroids', y='zero_crossing_rate', hue='pitch', palette='viridis')
# typo fix: "Spetral" -> "Spectral" in the displayed title
plt.title('Zero Crossing Rate vs Spectral Centroids', fontdict={'fontsize': 24})
plt.xlabel('Spectral Centroids')
plt.ylabel('Zero Crossing Rate')
plt.savefig('cent_zero_cross_scatter.png')
# creates barplot of MFCC 3 by instrument
# (seaborn barplot shows the mean per category with a confidence interval)
plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'mfcc3', palette = 'mako')
plt.xlabel('Instrument')
plt.ylabel('MFCC 3');
plt.title('MFCC 3 by Instrument')
plt.savefig('mfcc3_inst.png')
# creates barplot of MFCC 8 by instrument
plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'mfcc8', palette = 'mako')
plt.xlabel('Instrument')
plt.title('MFCC 8 by Instrument')
plt.ylabel('MFCC 8');
plt.savefig('mfcc8_inst.png')
# creates barplot of spectral bandwidth by instrument
plt.figure(figsize=(10,6), dpi=200)
sns.barplot(data=samples, x='instrument_family_str', y= 'spectral_bandwidth_2', palette = 'mako')
plt.xlabel('Instrument')
plt.ylabel('spectral_bandwidth')
plt.title('Spectral Bandwidth by Instrument');
plt.savefig('spec_band.png')
#samples.to_csv('samples.csv')
# creates plot showing relationship between various audio features with the hue representing pitch
# NOTE: pairplot over the full ~12k-row frame draws 49 subplots and can be slow
g = sns.pairplot(data=samples, vars = ['mfcc2','mfcc3','mfcc4','mfcc5','mfcc6','mfcc7','spectral_bandwidth_3'],hue='pitch', palette='viridis')
g.fig.suptitle("Pair Plot of Various Audio Features by Pitch", fontsize = 24, weight = 'bold', y = 1.02)
Text(0.5, 1.02, 'Pair Plot of Various Audio Features by Pitch')
samples.head()
| Unnamed: 0 | note_str | sample_rate | qualities_str | instrument_source | instrument_family_str | instrument_family | note | instrument_source_str | qualities | ... | mfcc_accelerate_16 | mfcc17 | mfcc_delta_17 | mfcc_accelerate_17 | mfcc18 | mfcc_delta_18 | mfcc_accelerate_18 | mfcc19 | mfcc_delta_19 | mfcc_accelerate_19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bass_electronic_018-022-050 | bass_electronic_018-022-050 | 16000 | ['percussive'] | 1 | bass | 0 | 277009 | electronic | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] | ... | 0.046337 | -0.739872 | -0.046959 | 0.051725 | -0.290447 | -0.082190 | 0.018868 | 2.694299 | -0.056343 | -0.000569 |
| 1 | bass_electronic_018-022-127 | bass_electronic_018-022-127 | 16000 | ['fast_decay', 'percussive'] | 1 | bass | 0 | 223304 | electronic | [0, 0, 0, 1, 0, 0, 0, 1, 0, 0] | ... | -0.275117 | -0.969569 | 0.028864 | -0.215973 | -2.457191 | -0.140676 | 0.338458 | -0.499463 | -0.884398 | 0.581262 |
| 2 | bass_electronic_018-023-050 | bass_electronic_018-023-050 | 16000 | [] | 1 | bass | 0 | 222626 | electronic | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | ... | 0.013104 | 1.079908 | -0.011627 | 0.000507 | 0.035780 | 0.013179 | 0.005179 | -0.784336 | 0.031024 | -0.011306 |
| 3 | bass_electronic_018-023-100 | bass_electronic_018-023-100 | 16000 | [] | 1 | bass | 0 | 230338 | electronic | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | ... | 0.006131 | 1.711984 | 0.009815 | 0.006658 | 0.741615 | -0.012899 | 0.003196 | -0.122626 | 0.010039 | -0.027255 |
| 4 | bass_electronic_018-024-050 | bass_electronic_018-024-050 | 16000 | [] | 1 | bass | 0 | 284868 | electronic | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] | ... | 0.010970 | 1.546056 | 0.016031 | -0.002961 | 0.683596 | -0.000470 | 0.006038 | -0.126844 | -0.017591 | -0.018448 |
5 rows × 102 columns
# drops identifier/metadata columns that are not model features
# ('file_name' appeared twice in the original drop list; deduplicated here —
# pandas drops each label once either way)
samples = samples.drop(
    ['note_str', 'instrument_source', 'instrument_family', 'sample_rate',
     'qualities_str', 'file_name', 'instrument_source_str', 'qualities',
     'Unnamed: 0', 'instrument_str', 'instrument', 'velocity', 'note'],
    axis=1,
)
# removes notes where pitch is outside range of a standard keyboard
# NOTE(review): the bounds are exclusive, so MIDI 21 (A0) and 108 (C8)
# themselves are dropped — confirm that is intended.
note = samples[samples['pitch'] > 21]
note = note[note['pitch'] < 108]
inst = note.drop('pitch', axis=1)  # frame for instrument prediction (pitch removed)
note = note.drop('instrument_family_str', axis=1)  # frame for pitch prediction (instrument removed)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# sets X,y values for instrument prediction
X=inst.drop('instrument_family_str', axis=1)
y=inst['instrument_family_str']
scaler = StandardScaler() # creates instance of standard scaler
# splits dataframe for training / testing
X_train_inst, X_test_inst, y_train_inst, y_test_inst = train_test_split(X, y, test_size=0.15, random_state=101)
# scaler is fit on the training split only, then applied to the test split,
# to avoid leaking test-set statistics into training
scaled_X_train_inst = scaler.fit_transform(X_train_inst) # fits scaler to training data, scales training data
scaled_X_test_inst = scaler.transform(X_test_inst) # scales test data
Logistic Regression
# saga solver chosen because it supports both the l1 and l2 penalties searched below
log_model=LogisticRegression(solver='saga', multi_class='ovr', max_iter=10000) # defines model to be used for training
# sets up parameters for grid search
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)  # 10 inverse-regularization strengths from 1 to 10^4
grid_model = GridSearchCV(log_model,param_grid={'C':C,'penalty':penalty})
grid_model.fit(scaled_X_train_inst,y_train_inst) # finds best parameters for model based on training data
GridSearchCV(estimator=LogisticRegression(max_iter=10000, multi_class='ovr',
solver='saga'),
param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
3.59381366e+03, 1.00000000e+04]),
'penalty': ['l1', 'l2']})
grid_model.best_params_ # displays best parameters from grid search
{'C': 10000.0, 'penalty': 'l2'}
y_preds = grid_model.predict(scaled_X_test_inst)  # predicts instrument name
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement when upgrading.
from sklearn.metrics import classification_report, plot_confusion_matrix
# classification_report expects (y_true, y_pred); the original call passed
# the predictions first, which swaps the precision and recall columns.
print(classification_report(y_test_inst, y_preds))
precision recall f1-score support
bass 0.85 0.79 0.82 407
brass 0.91 0.81 0.86 143
flute 0.71 0.79 0.75 78
guitar 0.75 0.67 0.71 316
keyboard 0.72 0.78 0.75 335
mallet 0.65 0.86 0.74 76
organ 0.90 0.91 0.91 228
reed 0.93 0.96 0.94 95
string 0.81 0.86 0.83 128
vocal 1.00 1.00 1.00 50
accuracy 0.81 1856
macro avg 0.82 0.84 0.83 1856
weighted avg 0.81 0.81 0.81 1856
# plots confusion matrix for instrument name
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# use ConfusionMatrixDisplay.from_estimator when upgrading.
# NOTE: plot_confusion_matrix creates its own figure, so the figsize set
# below has no effect (the "<Figure ... with 0 Axes>" output confirms an
# empty figure is left behind).
plt.figure(figsize=(14,14))
plot_confusion_matrix(grid_model,scaled_X_test_inst,y_test_inst)
plt.xticks(rotation=90);
<Figure size 1008x1008 with 0 Axes>
K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# determines error rate for KNN model with various K values to find the optimal K
test_error_rates = []
for k in range(1, 100):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(scaled_X_train_inst, y_train_inst)
    y_pred_test = knn_model.predict(scaled_X_test_inst)
    test_error = 1 - accuracy_score(y_test_inst, y_pred_test)
    test_error_rates.append(test_error)

# plots K value vs error rate (elbow plot)
plt.figure(figsize=(10, 6), dpi=200)
plt.plot(range(1, 100), test_error_rates, label='Test Error')
plt.legend()
plt.ylabel('Error Rate')
plt.xlabel("K Value")
plt.title('K Value vs Error Rate');
plt.savefig('k_val_inst.jpg')

KNN_model = KNeighborsClassifier(n_neighbors=1)  # initiates model with K = 1
KNN_model.fit(scaled_X_train_inst, y_train_inst)  # fits model to training data
y_pred_test = KNN_model.predict(scaled_X_test_inst)  # predicts instrument name
# classification_report expects (y_true, y_pred); the original call passed
# the predictions first, which swaps the precision and recall columns.
print(classification_report(y_test_inst, y_pred_test))
precision recall f1-score support
bass 0.99 0.95 0.97 397
brass 0.97 0.98 0.98 126
flute 1.00 0.98 0.99 89
guitar 0.95 0.98 0.96 276
keyboard 0.97 0.99 0.98 356
mallet 0.97 0.99 0.98 98
organ 0.97 1.00 0.99 225
reed 0.99 0.99 0.99 98
string 1.00 0.96 0.98 141
vocal 1.00 1.00 1.00 50
accuracy 0.98 1856
macro avg 0.98 0.98 0.98 1856
weighted avg 0.98 0.98 0.98 1856
Random Forest
from sklearn.ensemble import RandomForestClassifier
# NOTE: tree ensembles are scale-invariant, so using the scaled features here
# is harmless but not required
rand_model = RandomForestClassifier() # initiates random forest model
rand_model.fit(scaled_X_train_inst,y_train_inst) # fits model to training data
RandomForestClassifier()
preds_inst = rand_model.predict(scaled_X_test_inst)  # predicts instrument name
# classification_report expects (y_true, y_pred); the original call passed
# the predictions first, which swaps the precision and recall columns.
print(classification_report(y_test_inst, preds_inst))
precision recall f1-score support
bass 1.00 0.99 1.00 383
brass 1.00 1.00 1.00 128
flute 1.00 1.00 1.00 87
guitar 0.99 1.00 0.99 281
keyboard 1.00 1.00 1.00 362
mallet 1.00 1.00 1.00 100
organ 1.00 1.00 1.00 231
reed 1.00 1.00 1.00 98
string 1.00 1.00 1.00 136
vocal 1.00 1.00 1.00 50
accuracy 1.00 1856
macro avg 1.00 1.00 1.00 1856
weighted avg 1.00 1.00 1.00 1856
# displays confusion matrix for random forest
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# use ConfusionMatrixDisplay.from_estimator when upgrading.
plot_confusion_matrix(rand_model,scaled_X_test_inst,y_test_inst)
plt.xticks(rotation = 90);
note.head()
| pitch | zero_crossing_rate | zero_crossings | spectrogram | mel_spectrogram | harmonics | perceptual_shock_wave | spectral_centroids | spectral_centroids_delta | spectral_centroids_accelerate | ... | mfcc_accelerate_16 | mfcc17 | mfcc_delta_17 | mfcc_accelerate_17 | mfcc18 | mfcc_delta_18 | mfcc_accelerate_18 | mfcc19 | mfcc_delta_19 | mfcc_accelerate_19 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22 | 0.210125 | 15742 | -66.413757 | -77.642357 | 0.000006 | -0.000673 | 2671.302747 | 22.499443 | 1.726520 | ... | 0.046337 | -0.739872 | -0.046959 | 0.051725 | -0.290447 | -0.082190 | 0.018868 | 2.694299 | -0.056343 | -0.000569 |
| 1 | 22 | 0.062500 | 459 | -35.409962 | -52.084045 | 0.001413 | -0.001678 | 667.165083 | 82.330465 | 83.957916 | ... | -0.275117 | -0.969569 | 0.028864 | -0.215973 | -2.457191 | -0.140676 | 0.338458 | -0.499463 | -0.884398 | 0.581262 |
| 2 | 23 | 0.008180 | 589 | -25.864546 | -30.828018 | 0.000110 | 0.000114 | 129.129320 | 7.923706 | 3.023670 | ... | 0.013104 | 1.079908 | -0.011627 | 0.000507 | 0.035780 | 0.013179 | 0.005179 | -0.784336 | 0.031024 | -0.011306 |
| 3 | 23 | 0.007953 | 562 | -27.519205 | -33.968998 | 0.000027 | 0.000153 | 120.454945 | 5.007528 | 4.103833 | ... | 0.006131 | 1.711984 | 0.009815 | 0.006658 | 0.741615 | -0.012899 | 0.003196 | -0.122626 | 0.010039 | -0.027255 |
| 4 | 24 | 0.009085 | 646 | -26.927103 | -31.652479 | 0.000025 | 0.000088 | 127.136842 | 6.714991 | 3.042447 | ... | 0.010970 | 1.546056 | 0.016031 | -0.002961 | 0.683596 | -0.000470 | 0.006038 | -0.126844 | -0.017591 | -0.018448 |
5 rows × 88 columns
# sets X and y values for note (pitch) prediction
X = note.drop('pitch', axis=1)
y=note['pitch']
# splits data in train, test sets (same test_size and random_state as the
# instrument split, so the two test sets cover the same rows)
X_train_note, X_test_note, y_train_note, y_test_note = train_test_split(X, y, test_size=0.15, random_state=101)
rand_model2 = RandomForestClassifier() # initiates random forest model
# unscaled features are fine here: tree models do not require scaling
rand_model2.fit(X_train_note,y_train_note) # fits model to training data
RandomForestClassifier()
preds = rand_model2.predict(X_test_note)  # predicts pitch
# classification_report expects (y_true, y_pred); the original call passed
# the predictions first, which swaps the precision and recall columns.
print(classification_report(y_test_note, preds))
precision recall f1-score support
22 0.67 0.67 0.67 12
23 0.57 0.80 0.67 15
24 0.65 0.79 0.71 19
25 0.84 0.80 0.82 20
26 0.92 0.73 0.81 15
27 0.92 1.00 0.96 24
28 0.73 0.79 0.76 14
29 0.90 0.82 0.86 33
30 1.00 0.65 0.79 20
31 0.90 0.90 0.90 20
32 0.90 0.88 0.89 32
33 0.85 0.81 0.83 27
34 0.96 0.96 0.96 26
35 0.71 0.86 0.77 14
36 0.80 0.73 0.76 22
37 0.93 0.90 0.91 29
38 1.00 0.94 0.97 34
39 0.83 0.96 0.89 25
40 0.92 0.86 0.89 28
41 0.87 0.87 0.87 30
42 0.79 0.88 0.84 26
43 0.88 0.88 0.88 25
44 0.87 0.91 0.89 22
45 0.80 0.91 0.85 22
46 0.93 0.89 0.91 28
47 1.00 0.74 0.85 23
48 0.95 0.86 0.90 21
49 0.82 0.88 0.85 26
50 0.94 1.00 0.97 31
51 0.97 0.97 0.97 34
52 0.94 0.94 0.94 32
53 0.88 0.96 0.92 23
54 0.97 0.97 0.97 32
55 0.97 0.97 0.97 34
56 0.97 0.89 0.93 38
57 0.93 1.00 0.96 26
58 0.80 1.00 0.89 20
59 1.00 0.86 0.92 28
60 0.97 0.91 0.94 35
61 0.96 1.00 0.98 22
62 1.00 0.96 0.98 23
63 1.00 1.00 1.00 28
64 1.00 1.00 1.00 25
65 1.00 1.00 1.00 19
66 0.96 1.00 0.98 23
67 1.00 0.96 0.98 23
68 0.89 1.00 0.94 25
69 1.00 1.00 1.00 18
70 1.00 0.96 0.98 24
71 1.00 0.83 0.91 18
72 1.00 1.00 1.00 20
73 1.00 1.00 1.00 15
74 0.95 1.00 0.97 18
75 1.00 0.92 0.96 25
76 0.96 0.96 0.96 25
77 0.94 1.00 0.97 16
78 1.00 1.00 1.00 19
79 1.00 1.00 1.00 27
80 1.00 0.94 0.97 18
81 1.00 0.96 0.98 24
82 0.94 1.00 0.97 16
83 0.86 0.95 0.90 20
84 1.00 1.00 1.00 25
85 0.88 0.93 0.90 15
86 1.00 0.94 0.97 18
87 1.00 0.95 0.98 22
88 0.92 1.00 0.96 22
89 0.95 0.95 0.95 22
90 1.00 0.91 0.95 11
91 0.83 0.94 0.88 16
92 0.86 0.92 0.89 13
93 0.94 0.94 0.94 17
94 0.86 0.92 0.89 13
95 0.87 0.93 0.90 14
96 0.93 0.88 0.90 16
97 0.87 0.93 0.90 14
98 0.88 1.00 0.94 15
99 1.00 0.83 0.91 12
100 0.77 1.00 0.87 10
101 1.00 1.00 1.00 14
102 0.87 1.00 0.93 13
103 1.00 0.46 0.63 24
104 0.92 0.92 0.92 12
105 0.81 0.93 0.87 14
106 0.82 0.93 0.87 15
107 0.92 0.92 0.92 13
accuracy 0.92 1856
macro avg 0.91 0.92 0.91 1856
weighted avg 0.92 0.92 0.92 1856
param_grid = {"n_estimators":[100,150,200,250],'max_depth':[6,10,14,20,25]} # defines parameter values for grid search
rand_model3 = RandomForestClassifier() # initiates random forest model
grid = GridSearchCV(rand_model3,param_grid) # sets up grid search over the 20 parameter combinations
grid.fit(X_train_note,y_train_note) # performs grid search
GridSearchCV(estimator=RandomForestClassifier(),
param_grid={'max_depth': [6, 10, 14, 20, 25],
'n_estimators': [100, 150, 200, 250]})
grid.best_params_ # displays best parameters based on training data
{'max_depth': 25, 'n_estimators': 200}
preds = grid.predict(X_test_note)  # predicts pitch with the tuned random forest
# classification_report expects (y_true, y_pred); the original call passed
# the predictions first, which swaps the precision and recall columns.
print(classification_report(y_test_note, preds))
precision recall f1-score support
22 0.67 0.53 0.59 15
23 0.57 0.75 0.65 16
24 0.61 0.82 0.70 17
25 0.84 0.84 0.84 19
26 0.92 0.69 0.79 16
27 0.88 0.96 0.92 24
28 0.80 0.80 0.80 15
29 0.83 0.83 0.83 30
30 1.00 0.81 0.90 16
31 0.90 1.00 0.95 18
32 0.90 0.85 0.88 33
33 0.85 0.81 0.83 27
34 0.88 1.00 0.94 23
35 0.65 0.73 0.69 15
36 0.75 0.65 0.70 23
37 0.93 0.96 0.95 27
38 0.97 0.89 0.93 35
39 0.83 0.92 0.87 26
40 0.96 0.86 0.91 29
41 0.87 0.93 0.90 28
42 0.90 0.90 0.90 29
43 0.96 0.92 0.94 26
44 0.87 0.91 0.89 22
45 0.80 0.91 0.85 22
46 1.00 0.87 0.93 31
47 1.00 0.77 0.87 22
48 0.89 0.85 0.87 20
49 0.82 0.96 0.88 24
50 0.94 0.97 0.95 32
51 1.00 0.97 0.99 35
52 0.94 0.91 0.92 33
53 0.84 1.00 0.91 21
54 0.94 0.97 0.95 31
55 0.97 0.97 0.97 34
56 0.97 0.89 0.93 38
57 0.93 1.00 0.96 26
58 0.84 1.00 0.91 21
59 1.00 0.89 0.94 27
60 0.97 0.89 0.93 36
61 1.00 0.88 0.94 26
62 1.00 1.00 1.00 22
63 1.00 1.00 1.00 28
64 0.96 1.00 0.98 24
65 1.00 1.00 1.00 19
66 0.96 1.00 0.98 23
67 0.95 1.00 0.98 21
68 0.89 1.00 0.94 25
69 1.00 1.00 1.00 18
70 1.00 0.88 0.94 26
71 0.93 0.82 0.87 17
72 0.95 1.00 0.97 19
73 1.00 1.00 1.00 15
74 1.00 1.00 1.00 19
75 1.00 0.96 0.98 24
76 0.96 1.00 0.98 24
77 1.00 1.00 1.00 17
78 1.00 1.00 1.00 19
79 1.00 0.96 0.98 28
80 1.00 0.94 0.97 18
81 1.00 0.96 0.98 24
82 0.94 1.00 0.97 16
83 0.86 0.95 0.90 20
84 1.00 0.93 0.96 27
85 0.88 0.93 0.90 15
86 0.94 0.94 0.94 17
87 1.00 0.95 0.98 22
88 0.92 1.00 0.96 22
89 0.95 1.00 0.98 21
90 1.00 0.83 0.91 12
91 0.83 0.94 0.88 16
92 0.86 0.92 0.89 13
93 0.94 0.94 0.94 17
94 0.86 1.00 0.92 12
95 0.87 1.00 0.93 13
96 1.00 0.83 0.91 18
97 0.80 0.92 0.86 13
98 0.88 1.00 0.94 15
99 1.00 0.91 0.95 11
100 0.85 1.00 0.92 11
101 1.00 1.00 1.00 14
102 0.93 1.00 0.97 14
103 0.91 0.43 0.59 23
104 0.92 0.92 0.92 12
105 0.81 0.93 0.87 14
106 0.82 0.88 0.85 16
107 0.92 0.86 0.89 14
accuracy 0.92 1856
macro avg 0.91 0.91 0.91 1856
weighted avg 0.92 0.92 0.91 1856
# NOTE(review): preds_inst comes from the *instrument* split; its rows line up
# with X_test_note only because both splits used the same data order, length,
# test_size and random_state=101 — confirm before relying on this alignment.
X_test_note['Note'] = preds # adds note prediction to test set
X_test_note['Instrument'] = preds_inst # adds instrument prediction to test set
X_test_note.head()
| zero_crossing_rate | zero_crossings | spectrogram | mel_spectrogram | harmonics | perceptual_shock_wave | spectral_centroids | spectral_centroids_delta | spectral_centroids_accelerate | chroma1 | ... | mfcc_delta_17 | mfcc_accelerate_17 | mfcc18 | mfcc_delta_18 | mfcc_accelerate_18 | mfcc19 | mfcc_delta_19 | mfcc_accelerate_19 | Note | Instrument | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7829 | 0.007898 | 581 | -43.719860 | -77.248512 | -1.015214e-05 | 0.000017 | 226.374534 | 5.361960 | 2.498784 | 0.222539 | ... | 0.050393 | -0.021422 | -2.508775 | 0.052334 | -0.014904 | -2.337551 | 0.038754 | -0.007967 | 29 | keyboard |
| 2043 | 0.017844 | 1355 | -58.730682 | -77.676529 | 2.947440e-05 | -0.000027 | 202.495059 | 3.167869 | 1.487649 | 0.008804 | ... | -0.051306 | 0.028265 | -11.960894 | -0.041612 | 0.029439 | -12.619340 | -0.012467 | 0.019915 | 43 | bass |
| 8896 | 0.013378 | 1185 | -58.515682 | -79.649185 | 3.395107e-05 | -0.000033 | 205.533172 | 6.573461 | 2.840984 | 0.501567 | ... | 0.075102 | 0.020115 | -18.010260 | 0.071818 | 0.034119 | -16.802776 | 0.003094 | 0.043800 | 46 | mallet |
| 1830 | 0.003512 | 267 | -15.204553 | -43.861546 | -1.311943e-02 | 0.000019 | 79.184042 | 2.821670 | 5.462707 | 0.746378 | ... | 0.035672 | -0.006857 | 9.487486 | 0.042451 | -0.014337 | 9.742423 | -0.120129 | -0.019815 | 77 | bass |
| 12077 | 0.018680 | 1650 | -36.952751 | -68.855492 | 5.404165e-07 | 0.000111 | 444.422995 | 4.781714 | 2.502548 | 0.067997 | ... | 0.005530 | -0.016946 | -5.744551 | 0.008136 | -0.012692 | -4.648269 | 0.050792 | -0.009209 | 39 | string |
5 rows × 89 columns
keyboard = X_test_note[X_test_note['Instrument'] == 'keyboard'] # filters test set to only include keyboard instrument
staff = keyboard[(keyboard['Note'] < 80) & (keyboard['Note'] > 40)] # keeps pitches strictly between 40 and 80
# NOTE(review): wildcard import mid-file; only `converter` appears to be used
# below — consider `from music21 import converter` at the top of the file.
from music21 import *
staff.head()
| zero_crossing_rate | zero_crossings | spectrogram | mel_spectrogram | harmonics | perceptual_shock_wave | spectral_centroids | spectral_centroids_delta | spectral_centroids_accelerate | chroma1 | ... | mfcc_delta_17 | mfcc_accelerate_17 | mfcc18 | mfcc_delta_18 | mfcc_accelerate_18 | mfcc19 | mfcc_delta_19 | mfcc_accelerate_19 | Note | Instrument | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7993 | 0.049649 | 3398 | -77.101357 | -79.964249 | -0.000012 | 0.000011 | 672.998898 | 3.216142 | 2.635000 | 0.005937 | ... | 0.116904 | -0.019846 | -14.228360 | 0.085294 | 0.095099 | -13.402055 | 0.019591 | 0.112221 | 70 | keyboard |
| 7721 | 0.075480 | 5210 | -68.871651 | -79.879707 | -0.000010 | -0.000015 | 1049.140166 | -7.853725 | -1.897551 | 0.086783 | ... | -0.096741 | -0.113112 | 30.227087 | -0.146796 | -0.024512 | -12.848338 | 0.271040 | -0.012711 | 68 | keyboard |
| 7744 | 0.159245 | 10981 | -69.856911 | -79.926369 | -0.000006 | -0.000023 | 1805.684493 | -9.533681 | -3.812219 | 0.016535 | ... | -0.039040 | -0.090040 | -12.432023 | 0.062077 | 0.020225 | -31.255440 | 0.103454 | 0.199113 | 75 | keyboard |
| 8263 | 0.024276 | 602 | -62.082813 | -80.000000 | -0.000433 | -0.000147 | 371.681972 | -2.283279 | -1.647439 | 0.051801 | ... | -0.004512 | 0.109875 | -3.915097 | 0.001845 | 0.050360 | -6.831211 | 0.063688 | 0.099692 | 44 | keyboard |
| 8301 | 0.035381 | 932 | -68.732872 | -79.963409 | 0.000023 | -0.000119 | 537.949450 | 4.287896 | 4.590449 | 0.013967 | ... | -0.055562 | -0.217188 | 0.244699 | -0.080286 | -0.031404 | -3.288170 | -0.121592 | 0.036604 | 54 | keyboard |
5 rows × 89 columns
# defines function to convert a MIDI note number to a note name such as 'c5'
NOTES = ['c', 'c#', 'd', 'd#', 'e', 'f', 'f#', 'g', 'g#', 'a', 'a#', 'b']
OCTAVES = list(range(11))
NOTES_IN_OCTAVE = len(NOTES)

def number_to_note(number: int) -> str:
    """Return the note name plus octave (e.g. 'c5') for a MIDI note number.

    The original signature claimed ``-> tuple`` but always returned a string;
    the annotation is corrected here.

    Raises:
        ValueError: if ``number`` is outside the MIDI range 0-127.
    """
    # raise instead of assert: asserts are stripped under `python -O`
    if not 0 <= number <= 127:
        raise ValueError(f"MIDI note number out of range 0-127: {number}")
    octave, semitone = divmod(number, NOTES_IN_OCTAVE)
    return NOTES[semitone] + str(octave)
# converts the midi-number predictions to named notes and renders them on a staff
note_names = (number_to_note(v) for v in staff['Note'].values)
notes = '1/4 ' + ''.join(' ' + name + ' ' for name in note_names)
test_staff = converter.parse("tinyNotation:" + notes)  # plots notes on staff
test_staff.show()  # displays staff